library(tidyverse)
library(conformalInference.fd)
library(roahd)
library(lubridate)
library(mgcv)
library(mgcViz)
load("~/Documents/Nonparametric Statisics/Project/clean data/full_collisions.RData")
glimpse(full_collisions)
Rows: 2,585,717
Columns: 37
$ accident_index <chr> "200501BS00001", "200501BS00002", "200501BS0…
$ accident_year <dbl> 2005, 2005, 2005, 2005, 2005, 2005, 2005, 20…
$ accident_reference <chr> "01BS00001", "01BS00002", "01BS00003", "01BS…
$ location_easting_osgr <dbl> 525680, 524170, 524520, 526900, 528060, 5247…
$ location_northing_osgr <dbl> 178240, 181650, 182240, 177530, 179040, 1811…
$ longitude <dbl> -0.191170, -0.211708, -0.206458, -0.173862, …
$ latitude <dbl> 51.48910, 51.52007, 51.52530, 51.48244, 51.4…
$ police_force <fct> Metropolitan Police, Metropolitan Police, Me…
$ accident_severity <fct> Serious, Slight, Slight, Slight, Slight, Sli…
$ number_of_vehicles <dbl> 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1,…
$ number_of_casualties <dbl> 1, 1, 1, 1, 1, 1, 1, 2, 2, 5, 1, 1, 1, 1, 2,…
$ date <date> 2005-01-04, 2005-01-05, 2005-01-06, 2005-01…
$ day_of_week <fct> Tuesday, Wednesday, Thursday, Friday, Monday…
$ time <time> 17:42:00, 17:36:00, 00:15:00, 10:35:00, 21:…
$ local_authority_district <fct> "Kensington and Chelsea", "Kensington and Ch…
$ local_authority_ons_district <chr> "E09000020", "E09000020", "E09000020", "E090…
$ local_authority_highway <chr> "E09000020", "E09000020", "E09000020", "E090…
$ first_road_class <fct> A, B, C, A, Unclassified, Unclassified, C, A…
$ first_road_number <dbl> 3218, 450, 0, 3220, 0, 0, 0, 315, 3212, 450,…
$ road_type <fct> Single carriageway, Dual carriageway, Single…
$ speed_limit <dbl> 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, …
$ junction_detail <fct> Not at junction or within 20 metres, Crossro…
$ junction_control <fct> Data missing or out of range, Auto traffic s…
$ second_road_class <fct> Not at junction or within 20 metres, C, Not …
$ second_road_number <dbl> -1, 0, -1, -1, -1, -1, 0, -1, 304, 0, 325, 3…
$ pedestrian_crossing_human_control <fct> None within 50 metres, None within 50 metres…
$ pedestrian_crossing_physical_facilities <fct> "Zebra", "Pedestrian phase at traffic signal…
$ light_conditions <fct> Daylight, Darkness - lights lit, Darkness - …
$ weather_conditions <fct> Raining no high winds, Fine no high winds, F…
$ road_surface_conditions <fct> Wet or damp, Dry, Dry, Dry, Wet or damp, Wet…
$ special_conditions_at_site <fct> None, None, None, None, None, Oil or diesel,…
$ carriageway_hazards <fct> None, None, None, None, None, None, None, No…
$ urban_or_rural_area <fct> Urban, Urban, Urban, Urban, Urban, Urban, Ur…
$ did_police_officer_attend_scene_of_accident <fct> Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
$ trunk_road_flag <fct> Non-trunk, Non-trunk, Non-trunk, Non-trunk, …
$ lsoa_of_accident_location <chr> "E01002849", "E01002909", "E01002857", "E010…
$ datetime <dttm> 2005-01-04 17:42:00, 2005-01-05 17:36:00, 2…
we need to create the dataset in a way that they cotain all of the days and a zero if no crashes happened in that day:
all_dates <- unique(full_collisions$date)
all_police_forces <- unique(full_collisions$police_force)
full_combinations <- expand.grid(date = all_dates, police_force = all_police_forces)
crashes_per_day_police <- full_combinations %>%
left_join(full_collisions %>% group_by(date, police_force) %>% summarize(number_of_crashes = n()),
by = c("date", "police_force")) %>%
replace_na(list(number_of_crashes = 0))
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
glimpse(crashes_per_day_police)
Rows: 341,848
Columns: 3
$ date <date> 2005-01-04, 2005-01-05, 2005-01-06, 2005-01-07, 2005-01-10, 2005-01-1…
$ police_force <fct> Metropolitan Police, Metropolitan Police, Metropolitan Police, Metropo…
$ number_of_crashes <int> 58, 63, 64, 59, 67, 76, 86, 82, 57, 57, 85, 92, 82, 79, 64, 80, 78, 80…
crashes_per_day_police <- crashes_per_day_police %>% mutate(day_of_year = yday(date),
day = factor(wday(date)),year = year(date))
removing the 29 of february for dimensioality problems in the vectors below
crashes_per_day_police <- crashes_per_day_police %>% filter(!(month(date)==2 & day(date)==29))
train_years <- ceiling(seq(2005,2021,by = 1.5))
cal_years <- seq(2006,2021,3)
df_train <- crashes_per_day_police %>% filter(year %in% train_years)
df_cal <- crashes_per_day_police %>% filter(year %in% cal_years)
df_test <- crashes_per_day_police %>% filter(year == 2022)
starting with a gam and mixed effects
gam_train <- bam(number_of_crashes ~ day + year +
s(day_of_year,k = 53, bs = "cr") +
s(police_force, bs = "re"),
data = df_train, family=quasipoisson(), method='REML')
summary(gam_train)
Family: quasipoisson
Link function: log
Formula:
number_of_crashes ~ day + year + s(day_of_year, k = 53, bs = "cr") +
s(police_force, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 81.2959321 0.4499718 180.67 <2e-16 ***
day2 0.2543089 0.0040811 62.31 <2e-16 ***
day3 0.3038483 0.0040350 75.30 <2e-16 ***
day4 0.3109257 0.0040289 77.17 <2e-16 ***
day5 0.3146008 0.0040276 78.11 <2e-16 ***
day6 0.3942311 0.0039628 99.48 <2e-16 ***
day7 0.1876563 0.0041400 45.33 <2e-16 ***
year -0.0396915 0.0002158 -183.97 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(day_of_year) 49.18 51.6 157.6 <2e-16 ***
s(police_force) 50.99 51.0 16496.4 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.851 Deviance explained = 80.6%
-REML = 3.5021e+05 Scale est. = 1.6704 n = 208780
b <- getViz(gam_train)
print(plot(b, allTerms = T), pages = 1)
1 is sunday here
newd <- with(df_train, data.frame(day_of_year = 120,
day = 1,
year = 2022,
police_force = levels(police_force)))
p <- predict(gam_train, newd, type = "terms", se.fit = TRUE)
re <- p[["fit"]][ , "s(police_force)"]
se <- p[["se.fit"]][ , "s(police_force)"]
data <- data.frame(police_force = levels(df_train$police_force), effect = re,std.err = se)
data %>%
ggplot(aes(effect, fct_reorder(police_force, effect))) +
geom_vline(xintercept = 0, color = "gray50", lty = 2, linewidth = 1.2) +
geom_errorbar(aes(
xmin = effect - 1.96*std.err,
xmax = effect + 1.96*std.err),width = 0.5, alpha = 0.7) +
geom_point(size = 1,colour = "blue") + theme_minimal() +
labs(y = "District", x = "Random effect") + ggtitle("Random effects for each police force") +
theme(legend.position = "none",plot.title = element_text(size = 16,hjust = 0.5))
check(b)
we now need to compute the bands in a functional case
res_train <- cbind(df_train,preds = predict(gam_train, df_train, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_cal <- cbind(df_cal,preds = predict(gam_train, df_cal, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_test <- cbind(df_test,preds = predict(gam_train, df_test, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
n_pol <- length(levels(crashes_per_day_police$police_force))
m = 12
l = 6
alpha = 0.1
S = matrix(nrow = n_pol,ncol = 365)
k = rep(0,n_pol)
x = 1:365
line_integral = function(x, y) {
dx = diff(x)
end = length(y)
my = (y[1:(end - 1)] + y[2:end]) / 2
sum(dx *my)
}
ceiling((m+1)*(1-alpha))
we use the largest value for gamma H1 = I1
computing the modulation functions:
for(i in 1:n_pol){
ad_train <- res_train %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_train$absdiff,byrow = F,nrow = 365)
ncm = apply(a_mat,1,max)
den = line_integral(x, ncm)
s = ncm/den
S[i,] = s
}
computing the k:
ceiling((l + 1)*(1-alpha))
we use the biggest value for the choice of k
for(i in 1:n_pol){
ad_cal <- res_cal %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_cal$absdiff,byrow = F,nrow = 365)
ncs = apply(a_mat/S[i,],1,max)
k[i] = max(ncs)
}
i <- 50
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
this is not great:
i <- 6
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
i <- 30
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
this are too big, it is due to the fact that we are using a mixd effects model, so metropolitan police will contaminate all of the data.
we can fix this by making a gam for each police force:
gams <- list()
S = matrix(nrow = n_pol,ncol = 365)
k = rep(0,n_pol)
for(i in 1:n_pol){
df_train_mid <- df_train %>% filter(police_force == levels(police_force)[i])
df_cal_mid <- df_cal %>% filter(police_force == levels(police_force)[i])
gam <- bam(number_of_crashes ~ day + year +
s(day_of_year,k = 53, bs = "cr"),
data = df_train_mid, family=quasipoisson(), method='REML')
res_train_mid <- cbind(df_train_mid,preds = predict(gam, df_train_mid, type = "response")) %>%
arrange(date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_cal_mid <- cbind(df_cal_mid,preds = predict(gam, df_cal_mid, type = "response")) %>%
arrange(date) %>% mutate(absdiff = abs(number_of_crashes-preds))
ad_train <- res_train_mid %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_train$absdiff,byrow = F,nrow = 365)
ncm = apply(a_mat,1,max)
den = line_integral(x, ncm)
s = ncm/den
S[i,] = s
ad_cal <- res_cal_mid %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_cal$absdiff,byrow = F,nrow = 365)
ncs = apply(a_mat/S[i,],1,max)
k[i] = max(ncs)
gams[[i]] <- gam
}
midlands
i <- 50
df_test_filtered <- df_test %>% filter(police_force == levels(police_force)[i])
preds <- predict(gams[[i]], df_test_filtered, type = "response")
df_test_filtered <- cbind(df_test_filtered,preds) %>% arrange(day_of_year)
df_plot <- data.frame(n = df_test_filtered$number_of_crashes,pred = df_test_filtered$preds,
ymax = df_test_filtered$preds + k[i]*S[i,],
ymin = pmax( df_test_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
London:
i <- 30
df_test_filtered <- df_test %>% filter(police_force == levels(police_force)[i])
preds <- predict(gams[[i]], df_test_filtered, type = "response")
df_test_filtered <- cbind(df_test_filtered,preds) %>% arrange(day_of_year)
df_plot <- data.frame(n = df_test_filtered$number_of_crashes,pred = df_test_filtered$preds,
ymax = df_test_filtered$preds + k[i]*S[i,],
ymin = pmax( df_test_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
the model is better but still no great
we probably need more data.
using all the available data:
full_collisioin_data <- readr::read_csv(file="dft-road-casualty-statistics-collision-1979-latest-published-year.csv")
spec(full_collisioin_data)
glimpse(full_collisioin_data)
full_collisions_all <- stats19::format_collisions(full_collisioin_data)
glimpse(full_collisions_all)
all_dates <- unique(full_collisions_all$date)
all_police_forces <- unique(full_collisions_all$police_force)
full_combinations <- expand.grid(date = all_dates, police_force = all_police_forces)
crashes_per_day_police <- full_combinations %>%
left_join(full_collisions_all %>% group_by(date, police_force) %>%
summarize(number_of_crashes = n()),
by = c("date", "police_force")) %>%
replace_na(list(number_of_crashes = 0))
`summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
glimpse(crashes_per_day_police)
Rows: 835,744
Columns: 3
$ date <date> 1979-01-18, 1979-01-01, 1979-01-02, 1979-01-03, 1979-01-04, 1979-01-0…
$ police_force <fct> Metropolitan Police, Metropolitan Police, Metropolitan Police, Metropo…
$ number_of_crashes <int> 115, 39, 51, 59, 55, 84, 96, 72, 114, 132, 168, 128, 211, 145, 103, 14…
crashes_per_day_police <- crashes_per_day_police %>% mutate(day_of_year = yday(date),
day = factor(wday(date)),year = year(date))
removing the 29 of february for dimensioality problems in the vectors below
crashes_per_day_police <- crashes_per_day_police %>% filter(!(month(date)==2 & day(date)==29))
train_years <- ceiling(seq(1979,2021,by = 1.5))
cal_years <- seq(1979,2021,3)
df_train <- crashes_per_day_police %>% filter(year %in% train_years)
df_cal <- crashes_per_day_police %>% filter(year %in% cal_years)
df_test <- crashes_per_day_police %>% filter(year == 2022)
starting with a gam and mixed effects
gam_train <- bam(number_of_crashes ~ day + year +
s(day_of_year,k = 53, bs = "cr") +
s(police_force, bs = "re"),
data = df_train, family=quasipoisson(), method='REML')
summary(gam_train)
Family: quasipoisson
Link function: log
Formula:
number_of_crashes ~ day + year + s(day_of_year, k = 53, bs = "cr") +
s(police_force, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.030e+01 1.526e-01 264.1 <2e-16 ***
day2 2.351e-01 2.333e-03 100.7 <2e-16 ***
day3 2.505e-01 2.326e-03 107.7 <2e-16 ***
day4 2.651e-01 2.319e-03 114.3 <2e-16 ***
day5 2.991e-01 2.301e-03 129.9 <2e-16 ***
day6 4.129e-01 2.248e-03 183.7 <2e-16 ***
day7 2.345e-01 2.334e-03 100.5 <2e-16 ***
year -1.929e-02 4.717e-05 -409.1 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(day_of_year) 51.09 51.96 508.8 <2e-16 ***
s(police_force) 51.00 51.00 54521.2 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.894 Deviance explained = 84.1%
-REML = 9.6883e+05 Scale est. = 1.9754 n = 550420
b <- getViz(gam_train)
print(plot(b, allTerms = T), pages = 1)
1 is sunday here
newd <- with(df_train, data.frame(day_of_year = 120,
day = 1,
year = 2022,
police_force = levels(police_force)))
p <- predict(gam_train, newd, type = "terms", se.fit = TRUE)
re <- p[["fit"]][ , "s(police_force)"]
se <- p[["se.fit"]][ , "s(police_force)"]
data <- data.frame(police_force = levels(df_train$police_force), effect = re,std.err = se)
data %>%
ggplot(aes(effect, fct_reorder(police_force, effect))) +
geom_vline(xintercept = 0, color = "gray50", lty = 2, linewidth = 1.2) +
geom_errorbar(aes(
xmin = effect - 1.96*std.err,
xmax = effect + 1.96*std.err),width = 0.5, alpha = 0.7) +
geom_point(size = 1,colour = "blue") + theme_minimal() +
labs(y = "District", x = "Random effect") + ggtitle("Random effects for each police force") +
theme(legend.position = "none",plot.title = element_text(size = 16,hjust = 0.5))
check(b)
Method: REML Optimizer: outer newton
full convergence after 7 iterations.
Gradient range [-0.1344576,0.130992]
(score 968827.9 & scale 1.975359).
Hessian positive definite, eigenvalue range [23.46106,275205.6].
Model rank = 112 / 112
Basis dimension (k) checking results. Low p-value (k-index<1) may
indicate that k is too low, especially if edf is close to k'.
k' edf k-index p-value
s(day_of_year) 52.0 51.1 0.99 0.56
s(police_force) 52.0 51.0 NA NA
we now need to compute the bands in a functional case
res_train <- cbind(df_train,preds = predict(gam_train, df_train, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_cal <- cbind(df_cal,preds = predict(gam_train, df_cal, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_test <- cbind(df_test,preds = predict(gam_train, df_test, type = "response")) %>%
arrange(police_force,date) %>% mutate(absdiff = abs(number_of_crashes-preds))
n_pol <- length(levels(crashes_per_day_police$police_force))
m = length(train_years)
l = length(cal_years)
alpha = 0.1
S = matrix(nrow = n_pol,ncol = 365)
k = rep(0,n_pol)
x = 1:365
line_integral = function(x, y) {
dx = diff(x)
end = length(y)
my = (y[1:(end - 1)] + y[2:end]) / 2
sum(dx *my)
}
ceiling((m+1)*(1-alpha))
[1] 27
H1 != I1 now
computing the modulation functions:
for(i in 1:n_pol){
ad_train <- res_train %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_train$absdiff,byrow = F,nrow = 365)
setcfr <- apply(a_mat,2,max)
gamma <- sort(setcfr)[ceiling((m+1)*(1-alpha))]
a_mat_filt <- a_mat[,setcfr<=gamma]
ncm = apply(a_mat_filt,1,max)
ncm_sort <- sort(ncm)
den = line_integral(x, ncm)
s = ncm/den
S[i,] = s
}
computing the k:
ceiling((l + 1)*(1-alpha))
[1] 15
we use the biggest value for the choice of k
for(i in 1:n_pol){
ad_cal <- res_cal %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_cal$absdiff,byrow = F,nrow = 365)
ncs = apply(a_mat/S[i,],1,max)
k[i] = max(ncs)
}
i <- 50
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
this is not great:
i <- 6
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
i <- 30
df_filtered <- res_test %>% filter(police_force == levels(police_force)[i]) %>%
arrange(day_of_year)
df_plot <- data.frame(n = df_filtered$number_of_crashes,pred = df_filtered$preds,
ymax = df_filtered$preds + k[i]*S[i,],
ymin = pmax( df_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
this is eve worse tha before:
doing it for a single police force at a time:
gams <- list()
S = matrix(nrow = n_pol,ncol = 365)
k = rep(0,n_pol)
for(i in 1:n_pol){
df_train_mid <- df_train %>% filter(police_force == levels(police_force)[i])
df_cal_mid <- df_cal %>% filter(police_force == levels(police_force)[i])
gam <- bam(number_of_crashes ~ day + year +
s(day_of_year,k = 53, bs = "cr"),
data = df_train_mid, family=quasipoisson(), method='REML')
res_train_mid <- cbind(df_train_mid,preds = predict(gam, df_train_mid, type = "response")) %>%
arrange(date) %>% mutate(absdiff = abs(number_of_crashes-preds))
res_cal_mid <- cbind(df_cal_mid,preds = predict(gam, df_cal_mid, type = "response")) %>%
arrange(date) %>% mutate(absdiff = abs(number_of_crashes-preds))
ad_train <- res_train_mid %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_train$absdiff,byrow = F,nrow = 365)
setcfr <- apply(a_mat,2,max)
gamma <- sort(setcfr)[ceiling((m+1)*(1-alpha))]
a_mat_filt <- a_mat[,setcfr<=gamma]
ncm = apply(a_mat_filt,1,max)
ncm_sort <- sort(ncm)
den = line_integral(x, ncm)
s = ncm/den
S[i,] = s
ad_cal <- res_cal_mid %>% filter(police_force == levels(police_force)[i]) %>%
dplyr::select(absdiff)
a_mat <- matrix(ad_cal$absdiff,byrow = F,nrow = 365)
ncs = apply(a_mat/S[i,],1,max)
k[i] = max(ncs)
gams[[i]] <- gam
}
midlands
i <- 50
df_test_filtered <- df_test %>% filter(police_force == levels(police_force)[i])
preds <- predict(gams[[i]], df_test_filtered, type = "response")
df_test_filtered <- cbind(df_test_filtered,preds) %>% arrange(day_of_year)
df_plot <- data.frame(n = df_test_filtered$number_of_crashes,pred = df_test_filtered$preds,
ymax = df_test_filtered$preds + k[i]*S[i,],
ymin = pmax( df_test_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
London:
i <- 30
df_test_filtered <- df_test %>% filter(police_force == levels(police_force)[i])
preds <- predict(gams[[i]], df_test_filtered, type = "response")
df_test_filtered <- cbind(df_test_filtered,preds) %>% arrange(day_of_year)
df_plot <- data.frame(n = df_test_filtered$number_of_crashes,pred = df_test_filtered$preds,
ymax = df_test_filtered$preds + k[i]*S[i,],
ymin = pmax( df_test_filtered$preds - k[i]*S[i,], rep(0, 365)),td = 1:365)
df_plot %>% ggplot() + geom_ribbon(aes(x=td,y = pred,ymin = ymin, ymax = ymax),
fill = "blue", alpha = 0.25) +
geom_line(aes(x=td,y = n),linewidth=1,colour = "black") +
geom_line(aes(x=td,y = pred),linewidth=1,colour = "blue") + labs(x = "Day", y = "Number of accients",
title = levels(df_test$police_force)[i])
the model is still ot graet